library(tidyverse)

# Load the student-level analysis table; the relative path is resolved
# against the current working directory.
df_main <- read.csv(file = "main_df.csv")

# Fit a logistic regression for choosing `target_discipline` as a major.
#
# The engineering and biology models share the exact same sample
# restrictions, derived predictors and formula; only the outcome differs.
# Keeping that logic in one place prevents the two pipelines from drifting.
#
# Arguments:
#   df                 Data frame with the raw columns referenced below
#                      (GENDER, FAMILY_INCOME, US_CITIZENSHIP_STATUS,
#                      MAJOR_DISCIPLINE, UC_CALCULATED_GPA, RSAT_MATH_SCORE,
#                      RSAT_EBRW, PARENT_HIGHEST_EDUCATION_LEVEL).
#   target_discipline  Value of MAJOR_DISCIPLINE coded as the outcome (1);
#                      every other remaining discipline is coded 0.
#
# Returns: the fitted binomial `glm` object. Its `$data` element holds the
# filtered data frame including all derived columns.
fit_major_glm <- function(df, target_discipline) {
  model_data <- df %>%
    filter(
      GENDER %in% c("F", "M"),
      # Rows with NA income fail this comparison and filter() drops rows
      # whose condition is NA, so no separate is.na() check is needed.
      FAMILY_INCOME > 9999,
      US_CITIZENSHIP_STATUS %in% c("U.S. Citizen", "Perm Resident"),
      MAJOR_DISCIPLINE != "Other"
    ) %>%
    mutate(
      gender_f = ifelse(GENDER == "F", 1, 0),
      # Percentile ranks are computed within the filtered sample.
      gpa_perc = percent_rank(UC_CALCULATED_GPA),
      sat_math_perc = percent_rank(RSAT_MATH_SCORE),
      # 0 when the highest parental education is a four-year degree or
      # postgraduate study, 1 otherwise (NA stays NA). Not used in the
      # formula below, but kept so the model's `$data` is unchanged.
      first_gen = ifelse(
        PARENT_HIGHEST_EDUCATION_LEVEL == "Four-Year College/University Graduate" |
          PARENT_HIGHEST_EDUCATION_LEVEL == "Postgraduate Study",
        0, 1
      ),
      read_perc = percent_rank(RSAT_EBRW),
      major = factor(ifelse(MAJOR_DISCIPLINE == target_discipline, 1, 0))
    )

  # `gender_f * sat_math_perc` expands to both main effects plus their
  # interaction; the main effects are also listed explicitly for readability.
  glm(
    major ~ sat_math_perc +
      gender_f +
      gpa_perc +
      read_perc +
      gender_f * sat_math_perc,
    family = "binomial",
    data = model_data
  )
}

eng_glm_w_int <- fit_major_glm(df_main, "Engineering")

bio_glm_w_int <- fit_major_glm(df_main, "Biological Science")


# Predicted probability of majoring in engineering for each student in the
# model's data, alongside SAT math percentile and gender for plotting.
#
# predict() is given `newdata` explicitly: without it, predict() returns one
# value per row actually used in the fit, and glm() drops rows with missing
# predictors. The `$data` columns keep every row, so the lengths could
# disagree and data.frame() would fail or misalign values. With `newdata`,
# the result has one entry per row of `$data` (NA where a predictor is NA).
eng_preds <- data.frame(
  math_perc = eng_glm_w_int$data$sat_math_perc,
  preds = predict(
    eng_glm_w_int,
    newdata = eng_glm_w_int$data,
    type = "response"
  ),
  Gender = eng_glm_w_int$data$GENDER
)
# Keep only rows that have a prediction, i.e. rows with complete predictors.
eng_preds <- eng_preds[!is.na(eng_preds$preds), ]

# Predicted probability of majoring in biological science for each student
# in the model's data, alongside SAT math percentile and gender for plotting.
#
# predict() is given `newdata` explicitly: without it, predict() returns one
# value per row actually used in the fit, and glm() drops rows with missing
# predictors. The `$data` columns keep every row, so the lengths could
# disagree and data.frame() would fail or misalign values. With `newdata`,
# the result has one entry per row of `$data` (NA where a predictor is NA).
bio_preds <- data.frame(
  math_perc = bio_glm_w_int$data$sat_math_perc,
  preds = predict(
    bio_glm_w_int,
    newdata = bio_glm_w_int$data,
    type = "response"
  ),
  Gender = bio_glm_w_int$data$GENDER
)
# Keep only rows that have a prediction, i.e. rows with complete predictors.
bio_preds <- bio_preds[!is.na(bio_preds$preds), ]


# Write the biology predicted-probability plot to a PDF: a smoothed curve per
# gender over the individual predictions, against SAT math percentile.
pdf("bio_prob_plot_w_int.pdf")
# The plot is print()ed explicitly. A ggplot object is only drawn when it is
# printed, and auto-printing does not happen when the script is run through
# source() or from inside a function, which would leave the PDF empty.
print(
  ggplot(bio_preds, aes(x = math_perc, y = preds)) +
    geom_smooth(aes(color = Gender)) +
    geom_point(aes(color = Gender), size = .1, alpha = .4, show.legend = FALSE) +
    xlab("SAT Math Percentile") +
    ylab("Predicted Probability") +
    # Probabilities lie in [0, 1]; `to = 1` was previously implied by
    # seq()'s default and is now spelled out.
    scale_y_continuous(breaks = seq(from = 0, to = 1, by = .05)) +
    labs(title = "Biology") +
    theme_bw() +
    # Remove the grey confidence-band fill from the legend keys.
    guides(color = guide_legend(override.aes = list(fill = NA)))
)
dev.off()


# Write the engineering predicted-probability plot to a PDF: a smoothed curve
# per gender over the individual predictions, against SAT math percentile.
pdf("eng_prob_plot_w_int.pdf")
# The plot is print()ed explicitly. A ggplot object is only drawn when it is
# printed, and auto-printing does not happen when the script is run through
# source() or from inside a function, which would leave the PDF empty.
print(
  ggplot(eng_preds, aes(x = math_perc, y = preds)) +
    geom_smooth(aes(color = Gender)) +
    geom_point(aes(color = Gender), size = .1, alpha = .4, show.legend = FALSE) +
    xlab("SAT Math Percentile") +
    ylab("Predicted Probability") +
    # Probabilities lie in [0, 1]; `to = 1` was previously implied by
    # seq()'s default and is now spelled out.
    scale_y_continuous(breaks = seq(from = 0, to = 1, by = .05)) +
    theme_bw() +
    labs(title = "Engineering") +
    # Remove the grey confidence-band fill from the legend keys.
    guides(color = guide_legend(override.aes = list(fill = NA)))
)
dev.off()



